title: "A Mixture of Flexible Multivariate Distributions"
output: html_document

In this document, I will compute the posterior distribution for \(\beta_{j}\) using the derivation in the document 'next steps', where the prior on \(\beta_{j}\) is modeled as a mixture of multivariate normals. In this portion, I train on 10000 gene-SNP pairs and then test on 40000.

# Working directory containing all simulated summary-statistic files.
# NOTE(review): setwd() in a script is fragile; consider relative paths.
setwd("~/Dropbox/cyclingstatistician/beta_gp_continuous/")
tuto.dir=getwd()
##' @param b.gp.hat PxR matrix of standardized effect sizes across all `R' tissue types,
##' @param se.gp.hat RxR estimated covariance matrix of standardized standard errors, corresponding to the MLE of genotype effect for a given gene-SNP pair in all tissues
##' @param t.stat PxR matrix of t statistics for each gene-snp Pair across all R tissues
##' @param U.0kl RxR prior covariance matrix for posterior.covariance matrix K and weight l
##' @param pi LxK matrix of prior weights estimated from the EM algorithm which correspond to optimal weighting of prior covariance matrix

# Training set: first 10000 gene-SNP pairs; columns 1-2 (identifiers) dropped,
# rows with NAs removed.
#b.gp.hat=na.omit(read.table("16008genesnppairs_43tissues_beta.hat.std.txt",header=F,skip=1)[,-c(1,2)])
b.gp.hat=na.omit(read.table("50186simulatedbeta.hat.std.txt",header=F,skip=1)[,-c(1,2)])[1:10000,]

#se.gp.hat=na.omit(read.table("16008genesnppairs_43tissues_sigma.hat.std.txt",header=F,skip=1)[,-c(1,2)])
se.gp.hat=na.omit(read.table("50186simulatedse.hat.std.txt",header=F,skip=1)[,-c(1,2)])[1:10000,]

#t.stat=na.omit(read.table("16008genesnppairs_43tissues_t.stat.txt",header=F,skip=1)[,-c(1,2)])
t.stat=na.omit(read.table("50186simulatedt.hat.std.txt",header=F,skip=1)[,-c(1,2)])[1:10000,]


#p.vals=na.omit(read.table("16008genesnppairs_43tissues_pval.txt",header=F,skip=1)[,-c(1,2)])

# L = number of grid 'stretch' values; R = number of tissues; M = number of
# training pairs. X.c is the column-centered t-statistic matrix.
L = 5
R=ncol(b.gp.hat)#number of tissues
X.t=as.matrix(t.stat)
X.c=apply(X.t,2,function(x) x-mean(x)) ##Column centered matrix of t statistics
R=ncol(X.t)
M=nrow(X.t)
#colMeans(X.c)

Now, we need to load in the prior matrices for which we will try to find the optimal combination. First we load in the K RxR prior covariance matrices specifying the relative importance of a particular tissue direction in each component. Then, for every prior covariance matrix $U_{k}$, we compute L 'stretches' specifying the width of this distribution. Thus for each stretch \(\omega\), there will be K corresponding covariance matrices.

##' @param function to return the LxK component list of prior covariance matrices
##' @param X.c a matrix of column center tstatistics
##' @param P number of PCs to keep in approximation
##' @param omega.table dataframe of stretches
##' @return return L dimensional list of K dimensional lists where each L,K contains the Lth grid component times the K covariance matrix

# Grid of stretch values; row l scales every prior covariance in component
# list l below.
omega.table=read.table("~/Dropbox/cyclingstatistician/beta_gp_continuous/omega2.txt")


##' Build the L x K list of prior covariance matrices.
##' Uses globals X.c (column-centered t statistics), M (nrow(X.c)) and
##' omega.table (grid of stretches) -- NOTE(review): consider passing these
##' as arguments.
##' @param P number of principal components kept in the rank-P approximation
##' @param L number of grid stretches (rows of omega.table used)
##' @param R number of tissues
##' @return list of length L; element l holds K=3 RxR matrices scaled by
##'   omega.table[l,]: [[1]] identity, [[2]] correlation of t statistics,
##'   [[3]] rank-P PC approximation of that correlation.
get.prior.covar.Ukl=function(P,L,R){
  ## Loop-invariant pieces hoisted out of the l-loop (previously the SVD and
  ## the empirical correlation were recomputed on every iteration).
  S=cov2cor((t(X.c)%*%X.c)/M)
  svd.X=svd(X.c)##perform SVD on sample centered t statistics
  v=svd.X$v;u=svd.X$u;d=svd.X$d
  ## Rank-P reconstruction A = V_P D_P U_P'; cov.pc = A A' / M, as in the
  ## original expression.
  recon=v[,1:P]%*%diag(d[1:P])%*%t(u[,1:P])
  cov.pc=1/M*recon%*%t(recon)##Use the rank P summary representation
  pc.cor=cov2cor(cov.pc)
  test=list()
  for(l in 1:L){
    test[[l]]=list()
    omega=omega.table[l,]
    test[[l]][[1]]=omega*diag(1,R)# the first covariance matrix will be the 'sopped up' identity
    test[[l]][[2]]=omega*(S)
    test[[l]][[3]]=omega*pc.cor
  }
  return(U.0kl=test)}

Now that we have the prior covariance matrices, we can maximize the likelihood \(L(\pi;\hat\beta_{j},se_{j})\) by computing \(Pr(\hat \beta_{j} | \text{component } k,l)\) for each component at each gene-SNP pair and then finding the optimal combination among all gene-SNP pairs.

##' @return Compute a likelihood using the prior covariance matrix for each gene SNP pair in the rows and componenets in the columns
##' @param b.gp.hat and se.gp.hat matrix of MLES for all J gene snp pairs
##' 
##' @param U.0kl L dimensional list of K dimensional list with prior covairnace matrix for each grid weight, prior covariance pair
#install.packages("SQUAREM") note you'll need to have SQUAREM installed
library("SQUAREM")

# Build the full prior-covariance list; K and L are then re-derived from its
# shape so downstream loops stay consistent with the builder's output.
L=5
R=5
U.0kl=get.prior.covar.Ukl(P=5,L,R)
K=length(U.0kl[[1]])
L=length(U.0kl)
#U.0kl[[L+1]]=diag(1,R) ##Add the identity matrix to the list 

##J = Number of gene-snp pairs to consider
##' @return likelihood a function to return a J dimensional list of KxL vectors of likelihoods for each component
library("mvtnorm")
##' Component likelihoods for each gene-SNP pair.
##' @param b.gp.hat J x R matrix of effect-size MLEs
##' @param se.gp.hat J x R matrix of standard errors
##' @param J number of pairs to evaluate (default: all rows)
##' @return J-dimensional list; element j is a K*L vector of likelihoods,
##'   concatenated column by column ([k=1,l=1],[k=2,l=1],...).
##' Uses globals K, L and U.0kl; requires mvtnorm::dmvnorm.
likelihood=function(b.gp.hat,se.gp.hat,J=nrow(b.gp.hat))
{
  ## Fix: the original built a local list `lik.i` that was discarded --
  ## lapply's return value is the result, so just return the vector.
  lapply(seq_len(J),function(j){
    b.mle=b.gp.hat[j,]
    V.gp.hat=diag(se.gp.hat[j,])^2  # diagonal covariance of the MLE
    lik=matrix(NA,nrow=K,ncol=L)
    for(l in 1:L){
      for (k in 1:K){
        lik[k,l]=dmvnorm(x=b.mle, sigma=U.0kl[[l]][[k]] + V.gp.hat)
      }}
    as.vector(lik)
  })
}

Here, we load in a precomputed global likelihood as this takes time.

##Likelihood of each component in cols, gene=snp pairs in rows##
# J x (K*L); byrow=TRUE because likelihood() yields one K*L vector per pair.
global.lik=matrix(unlist(likelihood(b.gp.hat,se.gp.hat)),ncol=K*L,byrow=TRUE)
A="testA"
# NOTE(review): file name lacks a "." before "txt" ("...testAtxt"); the read
# below uses the same pattern so it is consistent, but easy to trip over.
write.table(global.lik,paste0("global.likNOSFA.",A,"txt"))

Now, to use the EM algorithm:

#install.packages("SQUAREM")
A="testA"
library("SQUAREM")
global.lik=read.table(paste0("global.likNOSFA.",A,"txt"))
# EM estimate of the mixture weights pi over the K*L components.
# NOTE(review): mixEM is not exported by SQUAREM -- presumably sourced from
# elsewhere (e.g. the ashr package); confirm it is in scope.
pis=mixEM(matrix_lik=global.lik,prior=rep(1,K*L))
# Component labels "k=..;l=..", laid out K x L to match likelihood order.
names.vec=matrix(NA,nrow=K,ncol=L)
for(l in 1:L){ 
  for(k in 1:K){
    names.vec[k,l]=paste0("k=",k,";l=",l)}}

#write.table((cbind(as.vector(names.vec),pis$pihat)),quote=FALSE,file=paste0("piklhatnoSFA.",A,".txt"))
pi.hat=read.table(file=paste0("piklhatnoSFA.",A,".txt"))
barplot(pis$pihat,names=as.vector(names.vec),main="mixEM estimated pi")

We can see that the majority of the weight is put on the covariance matrix of t statistics, \(X_{t}'X\) and the estimated covariance matrix, \(V_{t,1..K}\lambda^{2} V_{t,1..K}'\) where here I use K = 13. Recall each V is a \(43\) x \(1\) vector.

I compare with naive iteration, which simply weights the relative likelihood across individuals.

##' @param global.lik Computes the likelihood matrices for each component for each of j gene snp pairs
##' @return global.sum Sums the likelihood for each component across j gene snp pairs
##' @return global.norm Sums the likelihood for all components across all j pairs


library("mvtnorm")

# One naive weighting pass: each component's weight is its total likelihood
# summed across all gene-SNP pairs, normalized to sum to one.
# (b.gp.hat and se.gp.hat are unused; kept for interface compatibility.)
updated.weights=function(b.gp.hat,se.gp.hat,global.lik){
  component.totals <- as.matrix(colSums(global.lik))
  component.totals / sum(component.totals)
}

# Labels again (K x L), then one naive likelihood-weighting pass for
# comparison with the EM estimate.
names.vec=matrix(NA,nrow=K,ncol=L)
for(l in 1:L){ 
  for(k in 1:K){
    names.vec[k,l]=paste0("k=",k,";l=",l)}}

x=updated.weights(b.gp.hat,se.gp.hat,global.lik)
barplot(as.vector(x),names=as.vector(names.vec),main="Normalized Likelihood at Each Component")

Here, I’ve plotted the relative importance of each component after one iteration with a uniform prior weight on each \(\pi\).

Recall:

\[L(\beta_{j};k,l) = Pr(\hat{b}_{j} | k,l)\] \[=Pr(\hat{b}_{j}; 0, \omega^{2}_{l} U_{k} + \hat {V})\]

\[w_{jkl} = \frac{\pi_{k,l} L(\beta_{j};k,l)}{\sum_{k,l} \pi_{k,l} L(\beta_{j};k,l)}\]

We can then update our estimate of \(\pi_{k,l}\)

\[\pi_{kl}^{i+1} = \frac{\sum_{j} w_{jkl}}{\sum_{j,k,l} w_{jkl}}\]

Posterior computation

Now that we have the hierarchical weights \(\pi_{k,l}\), we can compute the posterior distribution for each component.

For a given prior covariance matrix, compute the posterior covariance and posterior mean. Here, let U.0k.l represent a specific matrix in U.0kl (e.g., \(U.0kl[[l]][[k]]\)).

##' @param U.0k.l let U.0k.l represent a specific matrix in U.0kl (.e.g, U.0kl[[l]][[k]])
##' @return post.b.gpkl.cov ## returns an R*R posterior covariance matrix for jth gene snp pair
##' @return post.b.gpkl.mean return a 1 * R vector of posterior means for a given prior covariance matrix

# Posterior covariance for one component: U1 = U0 (V^{-1} U0 + I)^{-1},
# algebraically (U0^{-1} + V^{-1})^{-1} but without inverting U0 (which may
# be singular).
post.b.gpkl.cov <- function(V.gp.hat.inv, U.0k.l){
  dim.r <- nrow(U.0k.l)
  U.0k.l %*% solve(V.gp.hat.inv %*% U.0k.l + diag(dim.r))
}

# Posterior mean for one component: mu1 = U1 V^{-1} b-hat (shrinks the MLE
# toward the zero prior mean).
post.b.gpkl.mean <- function(b.mle, V.gp.hat.inv, U.gp1kl){
  shrink <- U.gp1kl %*% V.gp.hat.inv
  shrink %*% b.mle
}

We also need to compute the "posterior weights" corresponding to each prior covariance matrix, which is simply the likelihood evaluated at that component times the prior weight, \(\pi_{k,l}\), normalized by the marginal likelihood over all components.

\(p(k=1,l=1|D)=\) \(\frac{p(D|k=1,l=1)*p(k=1,l=1)}{p(D)}\)

##' @param pi.hat = matrix of prior weights
##' @return a vector of posterior weights
# Prior weights are in column 2 of the saved table (column 1 = labels).
pis=pi.hat[,2]
##' Posterior component weights for one gene-SNP pair:
##' p(k,l | D) = pi_{k,l} N(b.mle; 0, U_{kl} + V) / sum_{k,l} of the same.
##' @param pis vector of K*L prior weights (column-major: k fastest)
##' @param U.0kl L-list of K-lists of prior covariance matrices
##' @param V.gp.hat diagonal covariance of the MLE for this pair
##' @param b.mle R-vector of effect-size MLEs
##' @return K*L vector of normalized posterior weights (column-major)
##' Uses globals K and L; requires mvtnorm::dmvnorm.
post.weight.func=function(pis,U.0kl,V.gp.hat,b.mle){
  wts=matrix(pis,nrow=K,ncol=L)  # hoisted: was rebuilt inside the inner loop
  post.weight.num = matrix(NA,nrow=K,ncol=L)
  for(k in 1:K){
    for(l in 1:L){
      post.weight.num[k,l]=wts[k,l]*dmvnorm(x=b.mle, sigma=U.0kl[[l]][[k]] + V.gp.hat)}
  }
  post.weight=post.weight.num/sum(post.weight.num)
  return(as.vector(post.weight))}

Now, for each gene-snp pair \(j\) and each prior covariance matrix \(U_{0kl}\) I will generate a 43 x 43 posterior covariance matrix and 43 x 1 vector of posterior means.

##' @param U.0kl = l dimensional list of k dimensional list of prior covariance matrices
##' @returm all.means, a J dimensional list of k dimensional lists of L dimensional list of 1xR vectors of posterior means
##' @returm all.covs, a J dimensional list of k dimensional lists of L dimensional list of RxR posterior covariances
##' @returm post.weight.matrix a J x (K*L) matrix of posterior weights coresponding to p(K,L|D) for each gene-snp Pair

##Now we use the test data

# Test set: gene-SNP pairs 10001-50000 (training used 1-10000).
b.gp.hat=na.omit(read.table("50186simulatedbeta.hat.std.txt",header=F,skip=1)[,-c(1,2)])[10001:50000,]

#se.gp.hat=na.omit(read.table("16008genesnppairs_43tissues_sigma.hat.std.txt",header=F,skip=1)[,-c(1,2)])[10001:50,000]
se.gp.hat=na.omit(read.table("50186simulatedse.hat.std.txt",header=F,skip=1)[,-c(1,2)])[10001:50000,]

#t.stat=na.omit(read.table("16008genesnppairs_43tissues_t.stat.txt",header=F,skip=1)[,-c(1,2)])
t.stat=na.omit(read.table("50186simulatedt.hat.std.txt",header=F,skip=1)[,-c(1,2)])[10001:50000,]




# Accumulators indexed [[j]][[k]][[l]]: posterior covariance, mean,
# P(beta_r > 0), P(beta_r < 0), and point mass at zero per tissue.
all.covs=list()
all.means=list()
all.up=list()
all.low=list()
all.null=list()
J=dim(b.gp.hat)[1]
## For every test pair j and every component (k,l): compute the component
## posterior covariance U1 and mean mu1, plus per-tissue tail probabilities
## P(beta_r > 0) (all.up) and P(beta_r < 0) (all.low). When the posterior sd
## is exactly zero, all mass is assigned to the null (all.null = 1).
#lapply(seq(1:J),function(j){
for(j in 1:J){
  b.mle=as.vector(t(b.gp.hat[j,]))##turn i into a R x 1 vector
  V.gp.hat=diag(se.gp.hat[j,])^2
  all.covs[[j]]=list()
  all.means[[j]]=list()
  all.up[[j]]=list()
  all.low[[j]]=list()
  all.null[[j]]=list()
  #temp.mean=matrix(NA,nrow=)
  
  V.gp.hat.inv <- solve(V.gp.hat)
  for(k in 1:K){
    all.covs[[j]][[k]]=list()
    all.means[[j]][[k]]=list()
    all.up[[j]][[k]]=list()
    all.low[[j]][[k]]=list()
    all.null[[j]][[k]]=list()
    
    for (l in 1:L){
      all.means[[j]][[k]][[l]]=list()
      all.covs[[j]][[k]][[l]]=list()
      all.up[[j]][[k]][[l]]=list()
      all.low[[j]][[k]][[l]]=list()
      all.null[[j]][[k]][[l]]=list()
      U.gp1kl <- post.b.gpkl.cov(V.gp.hat.inv, U.0kl[[l]][[k]]) ## returns an R*R posterior covariance matrix for jth gene snp pair
      mu.gp1kl <- post.b.gpkl.mean(b.mle, V.gp.hat.inv, U.gp1kl) ##return a 1 * R vector of posterior means for a given prior covariance matrix
      all.means[[j]][[k]][[l]]=mu.gp1kl
      all.covs[[j]][[k]][[l]]=U.gp1kl
      all.up[[j]][[k]][[l]]=rep(0,R)
      all.low[[j]][[k]][[l]]=rep(0,R)
      all.null[[j]][[k]][[l]]=rep(0,R)
      all.sds=sqrt(diag(U.gp1kl))
      # Tail probabilities of the marginal posterior for each tissue r.
      for(r in 1:R){
        if(all.sds[r]==0){
          all.up[[j]][[k]][[l]][r]=0#need to figure out how to make the list have individual entries
          all.low[[j]][[k]][[l]][r]=0
          all.null[[j]][[k]][[l]][r]=1}
        else{
          all.up[[j]][[k]][[l]][r]=pnorm(0,mean=mu.gp1kl[r],sd=sqrt(diag(U.gp1kl))[r],lower.tail=F)
          all.low[[j]][[k]][[l]][r]=pnorm(0,mean=mu.gp1kl[r],sd=sqrt(diag(U.gp1kl))[r],lower.tail=T)
          all.null[[j]][[k]][[l]][r]=0}
        
      }
    }
  }
}

Now, for each of the \(J\) gene-snp pairs we generate a matrix of posterior weight matrix. Remember, this will not be tissue specific information, so we need to do it only once and can store in an Jx(K*L) matrix

library("mvtnorm")
## Posterior weight matrix: one row per gene-SNP pair, K*L columns.
post.weight.matrix=matrix(NA,nrow=J,ncol=L*K)
J=dim(b.gp.hat)[1]

pis=pi.hat[,2]  # hoisted out of the loop: prior weights do not vary with j
                # (also removed the redundant self-assignment U.0kl=U.0kl)
for(j in 1:J){
  b.mle=as.vector(t(b.gp.hat[j,]))##turn row j into an R-vector
  V.gp.hat=diag(se.gp.hat[j,])^2
  post.weight.matrix[j,]=as.vector(post.weight.func(pis,U.0kl,V.gp.hat,b.mle))
}

write.table(post.weight.matrix,paste0("postweightmatrixnoSFA.",A,".txt"))

We can compute the overall posterior mean for each gene-SNP pair across tissues (i.e., the weighted \(R\) dimensional posterior mean vector of \(mu_{j}\)). Here, I do it for 10 gene SNP pairs.

#par(mfrow=c(3,3))

L=5
# Quick sanity check of a precomputed lfsr matrix: which pair attains the
# extreme lfsr in each tissue.
ls.mat=read.table("lfsr.50186.txt")
apply(ls.mat,2,which.min)
apply(ls.mat,2,which.max)


#post.weight.matrix=as.matrix(read.table("postweightmatrixnoSFA.txt"))

# Per-pair, per-tissue posterior summaries (filled in the loop below).
post.means=matrix(NA,nrow=J,ncol=R)
post.ups=matrix(NA,nrow=J,ncol=R)
post.lows=matrix(NA,nrow=J,ncol=R)
post.null=matrix(NA,nrow=J,ncol=R)
lfsr.mat=matrix(NA,nrow=J,ncol=R)


#if(plot=FALSE)
J=nrow(b.gp.hat)

## Combine per-component posteriors into overall per-pair summaries:
##   post.means[j,] = sum_{k,l} w_{jkl} * mu_{jkl}
## and likewise for the tail probabilities; the per-tissue lfsr is
## 1 - max(P(beta_r > 0), P(beta_r < 0)).
for(x in 1:nrow(b.gp.hat)){
  j=x
  mean.list=all.means[[j]]
  upper.list=all.up[[j]]
  lower.list=all.low[[j]]
  null.list=all.null[[j]]
  
  post.weights=post.weight.matrix[j,]
  post.weight.mat=matrix(post.weights,nrow=K,ncol=L)
  post.weight.mat[is.na(post.weight.mat)] <- 0  # NA weights contribute 0
  ## (fix: removed a stray no-op `truth[j,]` -- `truth` is not defined until
  ## later in the script, so it errored when run top-to-bottom)
  temp=NULL
  temp.up=NULL
  temp.low=NULL
  temp.null=NULL
  
  for(k in 1:K){
    for(l in 1:L){
      ## Rows accumulate in different orders for temp (prepend) and temp.up
      ## (append); harmless because only column sums are used below.
      temp=rbind(as.vector(post.weight.mat[k,l]*mean.list[[k]][[l]]),temp)
      up.row=t(as.matrix(as.vector(upper.list[[k]][[l]]*post.weight.mat[k,l])))
      temp.up=rbind(temp.up,up.row)
      temp.low=rbind(as.vector(post.weight.mat[k,l]*as.matrix(lower.list[[k]][[l]])),temp.low)
      temp.null=rbind(as.vector(post.weight.mat[k,l]*as.matrix(null.list[[k]][[l]])),temp.null)
    }
  }
  post.means[x,]=t(colSums(temp))
  post.ups[x,]=t(colSums(temp.up))
  post.lows[x,]=t(colSums(temp.low))
  post.null[x,]=t(colSums(temp.null))
  ## local false sign rate: probability the sign call is wrong
  lfsr.mat[x,]=as.matrix(apply(rbind(post.ups[x,],post.lows[x,]),2,function(x){1-max(x)}))
  lfsr=lfsr.mat[x,]
}
# 
# 
# Persist summaries, then reload so later chunks can run without recomputing.
write.table(post.means,paste0("post.means.no.sfa.",A,".txt"))
write.table(post.ups,paste0("post.ups.no.sfa.",A,".txt"))
write.table(post.lows,paste0("post.lows.no.sfa.",A,".txt"))
write.table(post.null,paste0("post.null.no.sfa.",A,".txt"))
write.table(lfsr.mat,paste0("lfsr.no.sfa.",A,".txt"))
post.means=as.matrix(read.table(paste0("post.means.no.sfa.",A,".txt")))
post.ups=as.matrix(read.table(paste0("post.ups.no.sfa.",A,".txt")))
post.lows=as.matrix(read.table(paste0("post.lows.no.sfa.",A,".txt")))
post.null=as.matrix(read.table(paste0("post.null.no.sfa.",A,".txt")))
lfsr.mat=as.matrix(read.table(paste0("lfsr.no.sfa.",A,".txt")))


# True configurations for the test pairs; config==0 marks null pairs.
truth=read.table("50186truth.txt")[10001:50000,]

false=which(truth$config==0)
real=which(truth$config!=0)
# 9 null and 8 real pairs selected for plotting.
snps=c(false[1:9],real[1:8])

# For each selected pair: barplot of |posterior means| annotated with lfsr,
# posterior mean vs MLE, and posterior mean vs true beta.
# NOTE(review): `names` is not defined until the SFA section below -- this
# loop errors if the file is run strictly top-to-bottom; confirm run order.
for(x in 1:length(snps)){
  j=snps[x]
  
  par(mfrow=c(1,3))
  #barplot(post.means[j,],main=paste0("PostTissueMeans,B_",j),col=c(1:43),las=2,names=names[-44])
  b=barplot(abs(post.means[j,]),main=paste0("PostTissueMeans",truth[j,2]),col=c(1:R),las=2,names=names,beside=TRUE,ylim=c(0,max(abs(post.means[j,]))+0.05))
  lfsr=lfsr.mat[j,]
  text(b, abs(post.means[j,]), labels=round(lfsr,2), pos=3, offset=.5)
  legend("topright",legend=round(lfsr,2),col=c(1:R),pch=1,title="lfsr")

  b.mle=t(b.gp.hat[j,])
  a.m=abs(b.mle)
  a.p=abs(post.means[j,])
  low=min(a.m-0.01,a.p-0.01)
  high=max(a.m+0.01,a.p+0.01)
  plot(abs(b.mle),abs(post.means[j,]),main="PostMeanvsMLE",xlim=c(low,high),ylim=c(low,high),col=c(1:R))
  
  true.m=as.numeric(truth[j,c(4:8)])
  plot(abs(true.m),abs(post.means[j,]),col=c(1:R),main="PostMeanvsTrueBeta")
  
  abline(0,1)##concern because the order of magnitude diff, but prior weight heavy on (U.0kl[[2]][[3]]) which has very small avg. variance
  #dev.off()
  
}

# lfsr vs posterior null mass (lfdr) per tissue.
par(mfrow=c(2,3))
for( r in seq(1:R)){
  plot(post.null[,r],lfsr.mat[,r],main=paste0("lfsr",r,"vs lfdr", r))
}

In order to compute the factors, we perform the following analysis.

  1. We perform the factor decomposition and use the first \(K\) = 10 matrix approximations to estimate the covariance of \(X^{t}X\). However, rather than add them as we did in PCs, we compute the weights for each component separately so that we can quantify the weight on each 'Factor Approximation' of the covariance. Recall that the loading corresponding to factor 1 corresponds to how much of 'master tissue pattern 1' each tissue carries, and so the entry for that tissue in the \(k\)th covariance matrix approximation will tell you the size of the effect in the direction of this 'master tissue-k specific pattern'. Similarly, the off-diagonal elements tell us the covariance between the performance of tissues \(i\) and \(j\) in this direction. See my document section entitled 'Interpretation.' I also use the full rank \(K=10\) approximation.
sfa -gen ./centered.t.stats.txt -g 6904 -n 43 -o tri -k 18 (here, we want each row to be maximally loaded on 1 factor, and each factor to correspond to a specific subset of tissues)
sfa -gen ./50186centered.t.stats.txt -g 50186 -n 5 -o tritut -k 20 (here, we want each row to be maximally loaded on 1 factor, and each factor to correspond to a specific subset of tissues)

Then, I will use \(K=18\) factor representations of the covariance matrix of \(X^{t}X\).

##Explore how many factors SNPs are loaded on##
## Training data load

# NOTE(review): clears the entire workspace to restart the SFA analysis;
# everything needed below is re-read from disk.
remove(list=ls())

b.gp.hat=na.omit(read.table("50186simulatedbeta.hat.std.txt",header=F,skip=1)[,-c(1,2)])[1:10000,]

#se.gp.hat=na.omit(read.table("16008genesnppairs_43tissues_sigma.hat.std.txt",header=F,skip=1)[,-c(1,2)])
se.gp.hat=na.omit(read.table("50186simulatedse.hat.std.txt",header=F,skip=1)[,-c(1,2)])[1:10000,]

#t.stat=na.omit(read.table("16008genesnppairs_43tissues_t.stat.txt",header=F,skip=1)[,-c(1,2)])
t.stat=na.omit(read.table("50186simulatedt.hat.std.txt",header=F,skip=1)[,-c(1,2)])[1:10000,]


#p.vals=na.omit(read.table("16008genesnppairs_43tissues_pval.txt",header=F,skip=1)[,-c(1,2)])

L = 5
R=ncol(b.gp.hat)#number of tissues
X.t=as.matrix(t.stat)
X.c=apply(X.t,2,function(x) x-mean(x)) ##Column centered matrix of t statistics
R=ncol(X.t)
M=nrow(X.t)


library("mvtnorm")
# SFA output: lambda = per-pair factor loadings, factor = factors x tissues.
lambda=as.matrix(read.table("tritut_lambda.out"))
factor=as.matrix(read.table("tritut_F.out"))

hist(apply(lambda,1,function(x){length(x[x!=0])}),main="Number of Factors A SNP is loaded on",xlab="NumberofFactors")

omega.table=read.table("~/Dropbox/cyclingstatistician/beta_gp_continuous/omega2.txt")




#P=25
P=5 ##maximally 5 PCs
##' Build the L x K list of prior covariance matrices, now including F
##' single-factor SFA approximations plus the full-rank factor product.
##' Uses globals X.c, M, omega.table, lambda (loadings), factor (factors).
##' @param P number of PCs kept in the rank-P approximation
##' @param L number of grid stretches
##' @param R number of tissues
##' @param F number of SFA factors (NOTE: shadows base F/FALSE inside the
##'   function; kept for interface compatibility with existing callers)
##' @return L-list of K = F+4 RxR matrices per stretch
get.prior.covar.Ukl=function(P,L,R,F=18){
  ## All covariance building blocks are independent of l, so compute them
  ## once (previously recomputed inside the stretch loop every iteration).
  S=cov2cor((t(X.c)%*%X.c)/M)
  svd.X=svd(X.c)##perform SVD on sample-centered t statistics
  v=svd.X$v;u=svd.X$u;d=svd.X$d
  recon=v[,1:P]%*%diag(d[1:P])%*%t(u[,1:P])
  cov.pc=1/M*recon%*%t(recon)##Use the rank P summary representation, also has shrinkage implications
  pc.cor=cov2cor(cov.pc)
  ## One rank-1 covariance per factor: cov of lambda_f %*% f_f.
  fact.cors=vector("list",F)
  for(f in 1:F){
    load=as.matrix(lambda[,f])
    fact=t(as.matrix(factor[f,]))
    rank.prox=load%*%fact
    a=cov2cor(1/M*(t(rank.prox)%*% rank.prox))
    a[is.nan(a)] = 0  # cov2cor yields NaN where a variance is exactly 0
    fact.cors[[f]]=a
  }
  full.rank=lambda%*%factor
  b=cov2cor(1/M*(t(full.rank)%*%full.rank))
  b[is.nan(b)]=0
  test=list()
  for(l in 1:L){
    test[[l]]=list()
    omega=omega.table[l,]
    test[[l]][[1]]=omega*diag(1,R)# the first covariance matrix will be the 'sopped up' identity
    test[[l]][[2]]=omega*S
    test[[l]][[3]]=omega*pc.cor
    for(f in 1:F){
      test[[l]][[f+3]]=omega*fact.cors[[f]]
    }
    test[[l]][[F+4]]=omega*b##NOT equivalent to the original covariance: the number of factors is truncated
  }
  return(U.0kl=test)}

# K = F + 4 = 22 components per stretch; L = 5 stretches.
U.0kl=get.prior.covar.Ukl(P=5,L=5,R=5,F=18)
(K=length(U.0kl[[1]]))
## [1] 22
(L=length(U.0kl))
## [1] 5

Now, compute the additional likelihoods.

library("mvtnorm")
##' Component likelihoods for each gene-SNP pair (SFA-augmented components).
##' @param b.gp.hat J x R matrix of effect-size MLEs
##' @param se.gp.hat J x R matrix of standard errors
##' @param J number of pairs to evaluate (default: all rows)
##' @return J-dimensional list; element j is a K*L vector of likelihoods,
##'   concatenated column by column ([k=1,l=1],[k=2,l=1],...).
##' Uses globals K, L and U.0kl; requires mvtnorm::dmvnorm.
likelihood=function(b.gp.hat,se.gp.hat,J=nrow(b.gp.hat))
{
  ## Fix: the original built a local list `lik.i` that was discarded --
  ## lapply's return value is the result, so just return the vector.
  lapply(seq_len(J),function(j){
    b.mle=b.gp.hat[j,]
    V.gp.hat=diag(se.gp.hat[j,])^2  # diagonal covariance of the MLE
    lik=matrix(NA,nrow=K,ncol=L)
    for(l in 1:L){
      for (k in 1:K){
        lik[k,l]=dmvnorm(x=b.mle, sigma=U.0kl[[l]][[k]] + V.gp.hat)
      }}
    as.vector(lik)
  })
}
##Likelihood of each component in cols, gene=snp pairs in rows##
# Fix: A must be assigned before it is used in the file names below -- the
# earlier remove(list=ls()) cleared it, so the write previously errored
# ("object 'A' not found").
A="test"
global.lik=matrix(unlist(likelihood(b.gp.hat,se.gp.hat)),ncol=K*L,byrow=TRUE)
write.table(global.lik,paste0("global.likSFA.",A,".txt"))
global.lik=read.table(paste0("global.likSFA.",A,".txt"))

Now, to use the EM algorithm:

#install.packages("SQUAREM")
library("SQUAREM")
# EM estimate of pi over the SFA-augmented component set.
# NOTE(review): mixEM is not exported by SQUAREM -- confirm it is in scope.
pis=mixEM(matrix_lik=global.lik,prior=rep(1,K*L))
names.vec=matrix(NA,nrow=K,ncol=L)
for(l in 1:L){ 
  for(k in 1:K){
    names.vec[k,l]=paste0("k=",k,";l=",l)}}

#write.table((cbind(as.vector(names.vec),pis$pihat)),quote=FALSE,file=paste0("piklhat",A,".txt"))
pi.hat=read.table(file=paste0("piklhat",A,".txt"))
barplot(pis$pihat,names=as.vector(names.vec),main="mixEM estimated pi")

We can see that the majority of the weight is put on the covariance matrix of t statistics, \(X_{t}'X\) and the estimated covariance matrix, \(V_{t,1..K}\lambda^{2} V_{t,1..K}'\) where here I use K = 13. Recall each V is a \(43\) x \(1\) vector.

I compare with naive iteration, which simply weights the relative likelihood across individuals.

# One barplot of estimated weights per stretch l. names.vec is K x L, so the
# vector slice start:end picks out column l (column-major indexing).
par(mfrow=c(3,2))
for(l in 1:L){ 
  i=l-1
  start=(i*K+1)
  end=(l*K)
  barplot(pi.hat[start:end,2],main="mixEM estimated pi",names=names.vec[start:end],las=2,col=c(1:14))
}

We now need to compute the posterior means for the test set. This is a weighted combination of the posterior means for each component, and we know that for each component:

\[\mu_{1}=\left(\boldsymbol\Sigma_0^{-1} + \Sigma^{-1}\right)^{-1}\left(\boldsymbol\Sigma^{-1} \mathbf{\hat\beta} \right)\]

\[U_{1}= \left(\boldsymbol\Sigma_0^{-1} + \boldsymbol\Sigma^{-1}\right)^{-1}\]

library("mvtnorm")


# Test set again: gene-SNP pairs 10001-50000.
b.gp.hat=na.omit(read.table("50186simulatedbeta.hat.std.txt",header=F,skip=1)[,-c(1,2)])[10001:50000,]

#se.gp.hat=na.omit(read.table("16008genesnppairs_43tissues_sigma.hat.std.txt",header=F,skip=1)[,-c(1,2)])[10001:50,000]
se.gp.hat=na.omit(read.table("50186simulatedse.hat.std.txt",header=F,skip=1)[,-c(1,2)])[10001:50000,]

#t.stat=na.omit(read.table("16008genesnppairs_43tissues_t.stat.txt",header=F,skip=1)[,-c(1,2)])
t.stat=na.omit(read.table("50186simulatedt.hat.std.txt",header=F,skip=1)[,-c(1,2)])[10001:50000,]


# Posterior covariance for one component: U1 = U0 (V^{-1} U0 + I)^{-1},
# i.e. (U0^{-1} + V^{-1})^{-1} computed without inverting U0.
post.b.gpkl.cov <- function(V.gp.hat.inv, U.0k.l){
  dim.r <- nrow(U.0k.l)
  U.0k.l %*% solve(V.gp.hat.inv %*% U.0k.l + diag(dim.r))
}

# Posterior mean for one component: mu1 = U1 V^{-1} b-hat.
post.b.gpkl.mean <- function(b.mle, V.gp.hat.inv, U.gp1kl){
  shrink <- U.gp1kl %*% V.gp.hat.inv
  shrink %*% b.mle
}



(K=length(U.0kl[[1]]))
(L=length(U.0kl))
# Accumulators indexed [[j]][[k]][[l]], as in the no-SFA section, now over
# K = 22 components.
all.covs=list()
all.means=list()
all.up=list()
all.low=list()
all.null=list()
J=dim(b.gp.hat)[1]
#lapply(seq(1:J),function(j){
for(j in 1:J){
  b.mle=as.vector(t(b.gp.hat[j,]))##turn i into a R x 1 vector
  V.gp.hat=diag(se.gp.hat[j,])^2
  all.covs[[j]]=list()
  all.means[[j]]=list()
  all.up[[j]]=list()
  all.low[[j]]=list()
  all.null[[j]]=list()
  #temp.mean=matrix(NA,nrow=)
  
  V.gp.hat.inv <- solve(V.gp.hat)
  for(k in 1:K){
    all.covs[[j]][[k]]=list()
    all.means[[j]][[k]]=list()
    all.up[[j]][[k]]=list()
    all.low[[j]][[k]]=list()
    all.null[[j]][[k]]=list()
    
    for (l in 1:L){
      all.means[[j]][[k]][[l]]=list()
      all.covs[[j]][[k]][[l]]=list()
      all.up[[j]][[k]][[l]]=list()
      all.low[[j]][[k]][[l]]=list()
      all.null[[j]][[k]][[l]]=list()
      U.gp1kl <- post.b.gpkl.cov(V.gp.hat.inv, U.0kl[[l]][[k]]) ## returns an R*R posterior covariance matrix for jth gene snp pair
      mu.gp1kl <- post.b.gpkl.mean(b.mle, V.gp.hat.inv, U.gp1kl) ##return a 1 * R vector of posterior means for a given prior covariance matrix
      all.means[[j]][[k]][[l]]=mu.gp1kl
      all.covs[[j]][[k]][[l]]=U.gp1kl
      all.up[[j]][[k]][[l]]=rep(0,R)
      all.low[[j]][[k]][[l]]=rep(0,R)
      all.null[[j]][[k]][[l]]=rep(0,R)
      all.sds=sqrt(diag(U.gp1kl))
      # Tail probabilities per tissue; zero posterior sd => point mass at 0.
      for(r in 1:R){
        if(all.sds[r]==0){
          all.up[[j]][[k]][[l]][r]=0#need to figure out how to make the list have individual entries
          all.low[[j]][[k]][[l]][r]=0
          all.null[[j]][[k]][[l]][r]=1}
        else{
          all.up[[j]][[k]][[l]][r]=pnorm(0,mean=mu.gp1kl[r],sd=sqrt(diag(U.gp1kl))[r],lower.tail=F)
          all.low[[j]][[k]][[l]][r]=pnorm(0,mean=mu.gp1kl[r],sd=sqrt(diag(U.gp1kl))[r],lower.tail=T)
          all.null[[j]][[k]][[l]][r]=0}
        
      }
    }
  }
}
(J=nrow(b.gp.hat))
##' Posterior component weights for one gene-SNP pair (SFA component set):
##' p(k,l | D) = pi_{k,l} N(b.mle; 0, U_{kl} + V) / sum_{k,l} of the same.
##' @param pis vector of K*L prior weights (column-major: k fastest)
##' @param U.0kl L-list of K-lists of prior covariance matrices
##' @param V.gp.hat diagonal covariance of the MLE for this pair
##' @param b.mle R-vector of effect-size MLEs
##' @return K*L vector of normalized posterior weights (column-major)
##' Uses globals K and L; requires mvtnorm::dmvnorm.
post.weight.func=function(pis,U.0kl,V.gp.hat,b.mle){
  wts=matrix(pis,nrow=K,ncol=L)  # hoisted: was rebuilt inside the inner loop
  post.weight.num = matrix(NA,nrow=K,ncol=L)
  for(k in 1:K){
    for(l in 1:L){
      post.weight.num[k,l]=wts[k,l]*dmvnorm(x=b.mle, sigma=U.0kl[[l]][[k]] + V.gp.hat)}
  }
  post.weight=post.weight.num/sum(post.weight.num)
  return(as.vector(post.weight))}

library("mvtnorm")
## Posterior weight matrix for the SFA-augmented component set.
post.weight.matrix=matrix(NA,nrow=J,ncol=L*K)
J=dim(b.gp.hat)[1]

pis=pi.hat[,2]  # hoisted out of the loop: prior weights do not vary with j
                # (also removed the redundant self-assignment U.0kl=U.0kl)
for(j in 1:J){
  b.mle=as.vector(t(b.gp.hat[j,]))##turn row j into an R-vector
  V.gp.hat=diag(se.gp.hat[j,])^2
  post.weight.matrix[j,]=as.vector(post.weight.func(pis,U.0kl,V.gp.hat,b.mle))
}

write.table(post.weight.matrix,paste0("post.weightmatrix",A,".txt"))
# (K and L are immediately re-derived from U.0kl a few lines below.)
L=5
K=22




# Per-pair, per-tissue posterior summaries for the SFA run.
post.means=matrix(NA,nrow=J,ncol=R)
post.ups=matrix(NA,nrow=J,ncol=R)
post.lows=matrix(NA,nrow=J,ncol=R)
post.null=matrix(NA,nrow=J,ncol=R)
lfsr.mat=matrix(NA,nrow=J,ncol=R)


#if(plot=FALSE)
(J=nrow(b.gp.hat))

post.weight.matrix=as.matrix(read.table(paste0("post.weightmatrix",A,".txt")))
(K=length(U.0kl[[1]]))
(L=length(U.0kl))
## Same component-combination step as the no-SFA section, over K=22
## components: weighted sums of per-component means and tail probabilities.
## NOTE(review): the bare `truth[j,]` below is a no-op and `truth` is not
## loaded until the plotting section -- confirm the intended run order.
for(x in 1:nrow(b.gp.hat)){
  j=x
  mean.list=all.means[[j]]
  upper.list=all.up[[j]]
  lower.list=all.low[[j]]
  null.list=all.null[[j]]
  
  post.weights=post.weight.matrix[j,]
  post.weight.mat=matrix(post.weights,nrow=K,ncol=L)
  post.weight.mat[is.na(post.weight.mat)] <- 0
  truth[j,]
  temp=NULL
  temp.up=NULL
  temp.low=NULL
  temp.null=NULL
  
  
  for(k in 1:K){
    for(l in 1:L){
      # NOTE(review): as.numeric() here is redundant -- the matrix is numeric.
      post.weight.mat[k,l]=as.numeric(post.weight.mat[k,l])
      
      temp=rbind(as.vector(as.numeric(post.weight.mat[k,l])*mean.list[[k]][[l]]),temp)##creates a 6x43 matrix of posterior means, but in [k=3,l=2][k=3,l=1][k=2,l=2][k=2,l=1] etc. (which is ok because we simply sum)
      
      up.row=as.vector(upper.list[[k]][[l]]*as.numeric(post.weight.mat[k,l]))
      up.row=t(as.matrix(up.row))
      temp.up=rbind(temp.up,up.row)
      temp.low=rbind(as.vector(as.numeric(post.weight.mat[k,l])*as.matrix(lower.list[[k]][[l]])),temp.low)
      temp.null=rbind(as.vector(as.numeric(post.weight.mat[k,l])*as.matrix(null.list[[k]][[l]])),temp.null)
      
    }
  }
  post.means[x,]=t(colSums(temp))
  post.ups[x,]=t(colSums(temp.up))
  post.lows[x,]=t(colSums(temp.low))
  post.null[x,]=t(colSums(temp.null))
  # local false sign rate per tissue
  lfsr.mat[x,]=as.matrix(apply(rbind(post.ups[x,],post.lows[x,]),2,function(x){1-max(x)}))
  
  lfsr=lfsr.mat[x,]
  print(j)
}

##define where you will store the test data set

# Persist the SFA-run posterior summaries.
A="testA"
write.table(post.means,paste0("post.means.with.sfa",A,".txt"))
 write.table(post.ups,paste0("post.ups.with.sfa",A,".txt"))
 write.table(post.lows,paste0("post.lows.with.sfa",A,".txt"))
 write.table(post.null,paste0("post.null.with.sfa",A,".txt"))
 write.table(lfsr.mat,paste0("lfsr.withsfa",A,".txt"))

And finally, we can plot the posterior means and the lsfr for each tissue from the test set.

A="testA"
# Reload the SFA-run summaries from disk.
post.means=as.matrix(read.table(paste0("post.means.with.sfa",A,".txt")))
post.ups=as.matrix(read.table(paste0("post.ups.with.sfa",A,".txt")))
post.lows=as.matrix(read.table(paste0("post.lows.with.sfa",A,".txt")))
post.null=as.matrix(read.table(paste0("post.null.with.sfa",A,".txt")))
lfsr.mat=as.matrix(read.table(paste0("lfsr.withsfa",A,".txt")))



# True configurations for the test pairs; config==0 marks null pairs.
truth=read.table("50186truth.txt")[10001:50000,]
false=which(truth$config==0)
real=which(truth$config!=0)
snps=c(false[1:9],real[1:8])


# Reconstruct tissue names from the eqtlbma summary-stat file names
# ("..._tissueI.txt.gz" -> "tissueI").
files=NULL
for(i in 1:R){files[i]=paste0("out_eqtlbma_sumstats_tissue",i,".txt.gz")}
                                                     

names=NULL
for(i in 1:length(files)){
  a=strsplit(files[i], '[_]')[[1]][4]
  names[i]=(strsplit(a, '[.]')[[1]][1])
}



# For each selected pair: barplot of |posterior means| annotated with lfsr,
# posterior mean vs MLE, and posterior mean vs true beta.
for(x in 1:length(snps)){
j=snps[x]
#pdf(paste0("postmeans",truth[j,2],".pdf"))
par(mfrow=c(1,3))
#barplot(post.means[j,],main=paste0("PostTissueMeans,B_",j),col=c(1:43),las=2,names=names[-44])
b=barplot(abs(post.means[j,]),main=paste0("PostTissueMeans,B_",truth[j,2]),col=c(1:R),las=2,names=names,beside=TRUE,ylim=c(0,max(abs(post.means[j,]))+0.05))
lfsr=lfsr.mat[j,]
text(b, abs(post.means[j,]), labels=round(lfsr,2), pos=3, offset=.5)
legend("topright",legend=round(lfsr,2),col=c(1:R),pch=1,title="lfsr")

b.mle=t(b.gp.hat[j,])
a.m=abs(b.mle)
a.p=abs(post.means[j,])
low=min(a.m-0.01,a.p-0.01)
high=max(a.m+0.01,a.p+0.01)
plot(abs(b.mle),abs(post.means[j,]),main="PostMeanvsMLE",xlim=c(low,high),ylim=c(low,high),col=c(1:R))

true.m=as.numeric(truth[j,c(4:8)])
plot(abs(true.m),abs(post.means[j,]),col=c(1:R),main="PostMeanvsTrueBeta")
                                                                   
abline(0,1)##concern because the order of magnitude diff, but prior weight heavy on (U.0kl[[2]][[3]]) which has very small avg. variance
#dev.off()


}

# pdf("plotalllfsr.pdf")
# par(mfrow=c(5,5))
# for(r in seq(1:R)){
#   for (p in seq(1:R)){
#     plot(lfsr.mat[,p],lfsr.mat[,r])
#     }
#   }
# dev.off()
# 
# write.table(post.null,"lfdr.txt")

# lfsr vs posterior null mass (lfdr) per tissue for the SFA run.
par(mfrow=c(2,3))
for( r in seq(1:R)){
  plot(post.null[,r],lfsr.mat[,r],main=paste0("lfsr",r,"vs lfdr", r))
}

## Compare lfsr with vs. without the SFA components.
## Fixes: as.matrix() was applied to the file *name* instead of the result;
## the file read was the no-SFA posterior-means table (and missing the "."
## before "txt") although the variable and axis labels describe the no-SFA
## lfsr matrix, which is what write.table() saved as "lfsr.no.sfa.<A>.txt".
lfsr1=as.matrix(read.table(paste0("lfsr.no.sfa.",A,".txt")))

par(mfrow=c(2,3))

 for( r in seq(1:R)){
   plot(lfsr.mat[,r],lfsr1[,r],main=paste0("lfsr.noSFA",r,"vs lfsr.SFA", r),xlab="lfsrwithSFA",ylab="lfsr.noSFA")
 }